import os

import fitz  # 注意：PyMuPDF 的导入名称是 fitz

# Input PDF and the directory that receives the extracted images.
pdf_path = "劳动合同法.pdf"
image_dir = "extracted_images"

# Open the PDF as a context manager so the document is closed even when a
# later step (text extraction, image extraction, file write) raises.
with fitz.open(pdf_path) as doc:
    # Basic document information. `or` is used instead of a .get() default so
    # the "无" placeholder also applies to fields that are present but empty
    # or None, not only to missing keys.
    metadata = doc.metadata or {}
    print(f"页数: {len(doc)}")
    print(f"标题: {metadata.get('title') or '无'}")
    print(f"作者: {metadata.get('author') or '无'}")
    print(f"创建日期: {metadata.get('creationDate') or '无'}")

    # Collect the text of every page, in page order, into a single string.
    text = "".join(page.get_text() for page in doc)

    print("提取的文本内容：")
    print(text)

    # Create the output directory; exist_ok avoids the check-then-create race.
    os.makedirs(image_dir, exist_ok=True)

    # Walk every page and save each image it references.
    for page_num, page in enumerate(doc, start=1):
        images = page.get_images(full=True)
        for img_index, img in enumerate(images, start=1):
            xref = img[0]  # first item of each image entry is its xref number
            base_image = doc.extract_image(xref)
            image_data = base_image["image"]
            # Name the file with the extension reported for the extracted
            # bytes rather than always ".png", so the suffix matches the data.
            image_ext = base_image["ext"]

            image_path = os.path.join(
                image_dir, f"page_{page_num}_img_{img_index}.{image_ext}"
            )
            with open(image_path, "wb") as f:
                f.write(image_data)
            print(f"已保存图像: {image_path}")
